# Calculate/load Differential Expression metrics for all genes, load SFARI dataset:
# Gene expression data
load('./../working_data/RNAseq_ASD_4region_normalized_vst.Rdata')
# SFARI genes
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
SFARI_genes = SFARI_genes %>% inner_join(datProbes, by=c('gene-symbol'='external_gene_id')) %>%
mutate('ID' = ensembl_gene_id) %>%
dplyr::select(ID, `gene-score`, syndromic)
# Balance Groups by covariates, remove singular batches (none)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
if(!file.exists('./../working_data/genes_ASD_DE_info_vst.csv')) {
# Calculate differential expression for ASD
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
genes_DE_info = top_genes[match(rownames(datExpr), rownames(top_genes)),] %>%
mutate('ID'=rownames(datExpr)) %>% left_join(SFARI_genes, by='ID')
write_csv(genes_DE_info, path='./../working_data/genes_ASD_DE_info_vst.csv')
rm(mod, corfit, lmfit, fit, top_genes)
} else {
genes_DE_info = read_csv('./../working_data/genes_ASD_DE_info_vst.csv')
}
genes_DE_info = genes_DE_info %>% dplyr::select(ID, logFC, AveExpr, t, P.Value, adj.P.Val,
B, `gene-score`, syndromic) %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`))
datExpr_backup= datExpr
rm(to_keep, datProbes)
Number of genes:
nrow(datExpr)
## [1] 49882
Gene count by SFARI score:
table(SFARI_genes$`gene-score`)
##
## 1 2 3 4 5 6
## 24 60 190 448 172 24
Gene count by brain lobe:
table(datMeta$Brain_lobe)
##
## Frontal Temporal Parietal Occipital CBL
## 21 20 22 23 0
Genes with scores 2 and 3 have slightly higher quartiles than genes without a score
ggplotly(genes_DE_info %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())
lfc=-1 means no filtering at all, the rest of the filterings include on top of the defined lfc, an adjusted p-value lower than 0.05
lfc_list = c(seq(0, 0.2, 0.05), seq(0.225, 0.675, 0.025), seq(0.7, 1.5, 0.05))
n_genes = nrow(datExpr)
# Calculate PCAs
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr %>% data.frame %>% prcomp(scale.=TRUE)
# Initialice DF to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=rownames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pca_samps_old = pcas_samps
pca_genes_old = pcas_genes
for(lfc in lfc_list){
# Filter DE genes with iteration's criteria
DE_genes = genes_DE_info %>% filter(adj.P.Val<0.05 & abs(logFC)>lfc)
datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
n_genes = c(n_genes, nrow(DE_genes))
# Calculate PCAs
datExpr_pca_samps = datExpr_DE %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr_DE %>% prcomp(scale.=TRUE)
# Create new DF entries
pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))
pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=DE_genes$ID, 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))
# Change PC sign if necessary
if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
if(cor(pca_genes_new$PC1, pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1 )<0){
pca_genes_new$PC1 = -pca_genes_new$PC1
}
if(cor(pca_genes_new$PC2, pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
pca_genes_new$PC2 = -pca_genes_new$PC2
}
pca_samps_old = pca_samps_new
pca_genes_old = pca_genes_new
# Update DFs
pcas_samps = rbind(pcas_samps, pca_samps_new)
pcas_genes = rbind(pcas_genes, pca_genes_new)
}
# Add Diagnosis/SFARI score information
pcas_samps = pcas_samps %>% mutate('ID'=substring(ID,2)) %>%
left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select(ID, PC1, PC2, lfc, Diagnosis_, Brain_lobe)
pcas_genes = pcas_genes %>% left_join(SFARI_genes, by='ID') %>%
mutate('score'=as.factor(`gene-score`)) %>%
dplyr::select(ID, PC1, PC2, lfc, score)
# Plot change of number of genes
ggplotly(data.frame('lfc'=lfc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) +
geom_point() + geom_line() + theme_minimal() +
ggtitle('Number of remaining genes when modifying filtering threshold'))
rm(datExpr_pca_genes, datExpr_pca_samps, DE_genes, datExpr_DE, pca_genes_new, pca_samps_new,
pca_genes_old, pca_samps_old, lfc_list, lfc)
Note: PC values get smaller as Log2 fold change increases, so on each iteration the values were scaled so it would be easier to compare between frames
ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Diagnosis_)) + geom_point(aes(frame=lfc, ids=ID)) +
theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))
No recognisable pattern
ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Brain_lobe)) + geom_point(aes(frame=lfc, ids=ID)) +
theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))
pcas_sfari_genes = pcas_genes %>% filter(!is.na(score)) %>% dplyr::select(-'score')
complete_sfari_df = expand.grid(unique(pcas_sfari_genes$ID), unique(pcas_sfari_genes$lfc))
colnames(complete_sfari_df) = c('ID', 'lfc')
pcas_sfari_genes = pcas_sfari_genes %>% right_join(complete_sfari_df, by=c('ID','lfc')) %>%
left_join(SFARI_genes, by='ID') %>%
mutate('score'=as.factor(`gene-score`), 'syndromic'=as.factor(syndromic))
pcas_sfari_genes[is.na(pcas_sfari_genes)] = 0 # Fix for ghost points
ggplotly(pcas_sfari_genes %>% ggplot(aes(PC1, PC2, color=score)) +
geom_point(aes(frame=lfc, ids=ID), alpha=0.6) + theme_minimal() +
ggtitle('Genes PCA plot modifying filtering threshold'))
Most of the genes get filtered out by the first adjusted p-value<0.05 filter (including all the genes with score=1 except two), but the proportion of genes left after the first cut is significantly higher for all scores and they seem to be generally filtered out after as well
table(SFARI_genes$`gene-score`[SFARI_genes$ID %in% genes_DE_info$ID[genes_DE_info$adj.P.Val<0.05]])
##
## 1 2 3 4 5 6
## 2 8 33 77 45 6
# Calculate percentage of genes remaining on each lfc by each score
score_count_by_lfc = pcas_genes %>% filter(!is.na(score)) %>% group_by(lfc, score) %>% tally %>% ungroup
score_count_pcnt = score_count_by_lfc %>% filter(lfc==-1) %>% mutate('n_init'=n) %>%
dplyr::select(score, n_init) %>% right_join(score_count_by_lfc, by='score') %>%
mutate('pcnt'=round(n/n_init*100, 2)) %>% filter(lfc!=-1)
# Complete missing entries with zeros
complete_score_count_pcnt = expand.grid(unique(score_count_pcnt$lfc), unique(score_count_pcnt$score))
colnames(complete_score_count_pcnt) = c('lfc', 'score')
score_count_pcnt = full_join(score_count_pcnt, complete_score_count_pcnt, by=c('lfc','score')) %>%
dplyr::select(score, lfc, n, pcnt)
score_count_pcnt[is.na(score_count_pcnt)] = 0
# Join counts by score and all genes
all_count_pcnt = pcas_genes %>% group_by(lfc) %>% tally %>% filter(lfc!=-1) %>%
mutate('pcnt'=round(n/nrow(datExpr)*100, 2), 'score'='All')
score_count_pcnt = rbind(score_count_pcnt, all_count_pcnt)
ggplotly(score_count_pcnt %>% ggplot(aes(lfc, pcnt, color=score)) + geom_point() + geom_line() +
scale_colour_manual(palette=gg_colour_hue) + theme_minimal() +
ggtitle('% of points left after each increase in log2 fold change'))
rm(score_count_by_lfc, complete_score_count_pcnt)
Most syndromic genes get filtered out with the p-value threshold, the remaining ones don’t seem to have a very different behaviour to the rest of the data, perhaps they survive a bit longer.
ggplotly(pcas_sfari_genes %>% ggplot(aes(PC1, PC2, color=ordered(syndromic, levels=c(1,0)))) +
geom_point(aes(frame=lfc, ids=ID), alpha=0.6) + theme_minimal() +
scale_colour_manual(palette=gg_colour_hue) +
ggtitle('Genes PCA plot modifying filtering threshold'))
# Calculate percentage of syndromic genes remaining on each lfc
syndromic_count_by_lfc = pcas_sfari_genes %>% filter(syndromic==1 & PC1!=0) %>% group_by(lfc) %>% tally %>%
ungroup %>% filter(lfc!=-1) %>%
mutate('pcnt' = round(n/nrow(SFARI_genes[SFARI_genes$syndromic==1,])*100,2), 'score'='syndromic')
# Complete missing entires with zeros and add stats for all genes for comparison
syndromic_count_by_lfc = data.frame('lfc' = unique(pcas_genes$lfc), 'score'='syndromic') %>% filter(lfc!=-1) %>%
full_join(syndromic_count_by_lfc, by=c('lfc','score')) %>% replace(.,is.na(.),0) %>%
rbind(all_count_pcnt) %>% mutate('score'=ordered(score, levels=c('syndromic','All')))
ggplotly(syndromic_count_by_lfc %>% ggplot(aes(lfc, pcnt, color=score)) + geom_point() + geom_line() +
scale_colour_manual(palette=gg_colour_hue) + theme_minimal() +
ggtitle('% of points left after each increase in log2 fold change'))
ggplotly(pcas_genes %>% ggplot(aes(PC1, PC2)) + geom_point(aes(frame=lfc, ids=ID, alpha=0.3)) +
theme_minimal() + ggtitle('Genes PCA plot modifying filtering threshold'))